knitr::opts_chunk$set(echo = TRUE)
options(stringsAsFactors = FALSE)

library(tidyverse)
theme_set(theme_classic())
theme_update(plot.title = element_text(face="bold"))

library(cowplot)
library(DT)

library(class)
library(MASS)
library(randomForest)

library(DESeq2)

compute <- FALSE

Import data

data <- read.delim2("Species_Habit_Orthogroups.GeneCount.June23.tsv", sep = "\t") %>%
  filter(!(Species %in% c("Drimys", "Macint", "Vitvin", "Aratha")))

cat("Raw data: Species x OrthoGroups: ", paste0(dim(data), collapse = " x "))
## Raw data: Species x OrthoGroups:  32 x 36519
idx <- colSums(data > 0)  >= 5
data <- data[,idx]
cat("Ortholog in at least five species: Species x OrthoGroups: ", paste0(dim(data), collapse = " x "))
## Ortholog in at least five species: Species x OrthoGroups:  32 x 13779
# Gene information - orthogroups
orthogroups <- read.delim2("Orthogroups.tsv", sep = "\t", header = TRUE) %>%
  dplyr::select(Orthogroup, Poptre.SHORT.pep) %>%
  dplyr::rename(OrthoGroup = Orthogroup, Pt = Poptre.SHORT.pep) %>%
  filter(Pt != "") %>%
  arrange(OrthoGroup) %>%
  separate_rows(Pt, sep = ", ") %>%
  separate(Pt, into = c("Pt"), sep = "\\.", extra = "drop") %>%
  mutate(Pt = gsub("Poptre_", "", Pt))


gene.info <- read.delim2("gene_info.txt") %>%
  dplyr::rename(Pt = gene_id, Descr = description) %>%
  dplyr::select(Pt, Descr)

orthogroups <- left_join(orthogroups, gene.info, "Pt") %>%
  mutate(Descr = gsub("\\S+\\|\\S+\\|\\S+\\s", "", Descr)) %>%
  mutate(Descr = gsub(" OS=.+$", "", Descr))

Differential analysis

Use a statistical test to identify ortholog groups with differences in copy numbers between woody and herbaceous growth form.

DOGs: Differential Ortholog Groups

dds <- DESeqDataSetFromMatrix(countData = t(data[, 3:ncol(data)]),
                            colData = DataFrame(condition = as.factor(data$Habit)),
                            formula(~ condition))

dds <- DESeq(dds)

degres <- results(dds) %>%
  as.data.frame %>%
  rownames_to_column(var = "OrthoGroups") %>%
  drop_na(pvalue) %>%
  mutate(padj = p.adjust(pvalue, method = "BH")) %>%
  arrange(padj)

nDOGs <- nrow(degres %>% filter(padj <= 0.1))
cat("DOGs: ", nDOGs, "\n")
## DOGs:  26
degres <- degres %>%
  dplyr::select(-baseMean, -lfcSE, -pvalue) %>%
  filter(padj <= 0.1) %>%
  mutate(log2FoldChange = round(log2FoldChange, digits = 3)) %>%
  mutate(stat = round(stat, digits = 3)) %>%
  mutate(padj = format(padj, digits = 3, scientific = TRUE))

datatable(degres, rownames = FALSE, filter = "top",
          options = list(
            columnDefs = list(list(className = 'dt-center', targets = "_all"))
            )
          )
if (FALSE) {
  plots <- list()
  for (i in 1:nDOGs) {
  
    orth <- sym(degres$OrthoGroups[i])
    p <- degres$padj[i]
    
    plots[[i]] <- ggplot(data, aes(x = Habit, y = !!orth, col = Habit)) + 
      theme_bw() + theme() +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(color="black", size=1, alpha=0.9, shape=16, width = 0.1, height = 0) +
      geom_text(x=1.5, y=max(data[[degres$OrthoGroups[i]]]), col = "black", 
                label=paste("p = ", p))
  }
  
  plot_grid(plotlist = plots, ncol = 3)
}

Random forest

Use machine learning to identify ortholog groups with differences in copy numbers between woody and herbaceous growth form.

SOGs: Significant Ortholog Groups

set.seed(3)

# Random forest
if (compute) {

  fit <-  randomForest(Habit ~ ., data=cbind(data[, 3:ncol(data)], 
                                           data.frame(Habit = as.factor(data$Habit))), 
                     importance=TRUE, proximity=TRUE,
                     ntree=1000)
  
  confussion.matrix <- table(data$Habit, fit$predicted)
  accuracy <- sum(data$Habit == fit$predicted)/nrow(data)
  
  feature.importance <- as.data.frame(importance(fit))

  # Randomize
  idx <- sample(1:nrow(data), nrow(data), replace = FALSE)
  fit.rand <-  randomForest(Habit ~ ., data=cbind(data[, 3:ncol(data)], 
                                           data.frame(Habit = as.factor(data$Habit[idx]))), 
                     importance=TRUE, proximity=TRUE,
                     ntree=1000)
  
  table(data$Habit[idx], fit.rand$predicted)
  sum(data$Habit[idx] == fit.rand$predicted)/nrow(data)
  
  rand.scores <- importance(fit.rand)[,"MeanDecreaseGini"]
  
  # Save
  save(confussion.matrix, accuracy, feature.importance, rand.scores, file = "RData/rf_randomize.RData")
} else {
  load(file = "RData/rf_randomize-without-outgroups.RData")
}

confussion.matrix
##     
##      HA WP
##   HA  4  7
##   WP  1 20
accuracy
## [1] 0.75
feature.importance <- feature.importance %>%
  rownames_to_column(var = "OrthoGroups") %>%
  arrange(desc(MeanDecreaseGini)) %>%
  filter(MeanDecreaseGini > 0) %>%
  rowwise() %>%
  mutate(pvalue = (1+sum(rand.scores >= MeanDecreaseGini))/(length(rand.scores)+1)) %>%
  ungroup() %>%
  mutate(padj = p.adjust(pvalue, method = "BH"))

nSOGs <- nrow(feature.importance %>% filter(padj <= 0.1))
cat("SOGs: ", nSOGs, "\n")
## SOGs:  101
feature.importance <- feature.importance %>%
  dplyr::select(-MeanDecreaseAccuracy, -pvalue) %>%
  filter(padj <= 0.1) %>%
  mutate(HA = round(HA, digits = 3)) %>%
  mutate(WP = round(WP, digits = 3)) %>%
  mutate(MeanDecreaseGini = round(MeanDecreaseGini, digits = 3)) %>%
  mutate(padj = format(padj, digits = 3, scientific = TRUE))

datatable(feature.importance, 
          rownames = FALSE, filter = "top",
          options = list(
            columnDefs = list(list(className = 'dt-center', targets = "_all"))
            )
          )
if (FALSE) {
  plots <- list()
  for (i in 1:nSOGs) {
  
    orth <- sym(feature.importance$OrthoGroups[i])
    
    plots[[i]] <- ggplot(data, aes(x = Habit, y = !!orth, col = Habit)) + 
      theme_bw() + theme() +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(color="black", size=1, alpha=0.9, shape=16, width = 0.1, height = 0)
  }
  
  plot_grid(plotlist = plots, ncol = 3)
}

AspWood expression

Ortholog groups identified by one of the two methods:

Box plots show copy numbers in the two growth forms

Line plots show AspWood expression of the most highly expressed aspen gene in the ortholog group.

OBS: Expression plots for all aspen genes can be found here.

candidate_orthogroups <- intersect(degres$OrthoGroups, feature.importance$OrthoGroups)
cat("Intersection DOGs and SOGs:", length(candidate_orthogroups))
## Intersection DOGs and SOGs: 16
candidate_orthogroups <- rbind(degres %>% dplyr::select(OrthoGroups, padj),
                               feature.importance %>% dplyr::select(OrthoGroups, padj)) %>%
  arrange(as.numeric(padj)) %>%
  distinct(OrthoGroups) %>%
  pull(OrthoGroups)

aspwood_expr <- read.delim("../../DATA/AspWood_transcriptomics_Ptremula.txt", 
                           header = TRUE, sep = "\t") %>%
  dplyr::rename(Pt = Genes) %>%
  gather (Samples, Expression, -1) %>%
  filter (Pt %in% orthogroups$Pt) %>%
  separate(Samples, into = c("Trees", "Samples"), sep = "\\.") %>%
  mutate_at("Samples", as.numeric) %>%
  mutate(Expression = log2(Expression+1))

#plot(density(aspwood_expr$Expression), xlab = "log2(TPM)")

aspwood_expr %>%
  group_by(Pt) %>%
  summarise(Max = max(Expression)) -> max_expr

d <- tibble(Trees = paste0("T", c(rep(1,3),rep(2,3),rep(3,3),rep(4,3))), 
           xintercept = c(5.5, 12.5, 19.5, 5.5, 11.5, 19.5, 5.5, 14.5, 21.5, 5.5, 12.5, 20.5))

plots <- list()
for (orth in candidate_orthogroups) {
  
  candidates_pt <- orthogroups %>%
    filter(OrthoGroup == orth) %>%
    left_join(max_expr, by = c("Pt")) %>%
    arrange(desc(Max)) %>% 
    dplyr::slice(1)
  
  if (nrow(candidates_pt) > 0) { # No Pt gene in orthogroup?
  
    orth_sym <- sym(orth)
    plots[[length(plots)+1]] <- ggplot(data, aes(x = Habit, y = !!orth_sym, col = Habit)) + 
      theme_bw() + theme() +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(color="black", size=1, alpha=0.9, shape=16, width = 0.1, height = 0)
      
    descr <- candidates_pt$Descr
    candidates_pt <- candidates_pt$Pt  
  
    plots[[length(plots)+1]] <- aspwood_expr %>% filter(Pt == candidates_pt) %>% 
      ggplot(aes(x = Samples, y = Expression, col = Trees)) +
      geom_line() +
      geom_vline(data = d, mapping = aes(xintercept = xintercept), linetype="dashed", 
                 color = "gray68", linewidth=0.5) +
      facet_grid(cols = vars(Trees)) + 
      theme(legend.position = "none") +
      ggtitle(label = paste(orth, " - ", candidates_pt, sep = ""), 
              subtitle = descr)
  }
      
}

plot_grid(plotlist = plots, ncol = 2)

for (orth in candidate_orthogroups) {

  plots <- list()
  
  orth_sym <- sym(orth)
  plots[[length(plots)+1]] <- ggplot(data, aes(x = Habit, y = !!orth_sym, col = Habit)) + 
    theme_bw() + theme() +
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(color="black", size=1, alpha=0.9, shape=16, width = 0.1, height = 0)
  
  candidates_pt <- orthogroups[orthogroups$OrthoGroup == orth,]$Pt
  candidates_pt <- candidates_pt[candidates_pt %in% aspwood_expr$Pt]
  
  candidates_pt <- max_expr %>% filter(Pt %in% candidates_pt) %>% 
    arrange(desc(Max))
  candidates_pt <- candidates_pt$Pt
  
  if (length(candidates_pt) > 0) {
    for (i in 1:length(candidates_pt)) {
      
      descr <- orthogroups[orthogroups$Pt == candidates_pt[i],]$Descr
      
      plots[[length(plots)+1]] <- aspwood_expr %>% filter(Pt == candidates_pt[i]) %>% 
        ggplot(aes(x = Samples, y = Expression, col = Trees)) +
        geom_line() +
        geom_vline(data = d, mapping = aes(xintercept = xintercept), linetype="dashed", 
                   color = "gray68", linewidth=0.5) +
        facet_grid(cols = vars(Trees)) + 
        theme(legend.position = "none") +
        ggtitle(label = paste(orth, " - ", candidates_pt[i], sep = ""), 
                subtitle = descr)
      
    }
  }
  
  pdf(paste0("figs/", orth, ".pdf"), height = length(candidates_pt)*2)
  print(plot_grid(plotlist = plots, ncol = 2))
  dev.off()
}